Concat the data

- amazon

In [21]:
import pandas as pd

amazon = pd.read_csv('amazon_comment_seaweed_chip_result.csv')  # 아마존 데이터
amazon = amazon.drop(amazon.columns[0], axis = 1)
amazon.head()
Out[21]:
comment_author star_rating comment_date comment_title comment
0 Dmitry 1.0 out of 5 stars Reviewed in the United States on May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ...
1 S A 1.0 out of 5 stars Reviewed in the United States on September 17,... Bug eggs in package?!! Thsee don't expire until December. I opened ...
2 Elisa S. 5.0 out of 5 stars Reviewed in the United States on December 17, ... #1 on my list The photo shows the seaweed brands I've trie...
3 Ani_Mon 1.0 out of 5 stars Reviewed in the United States on July 23, 2017 Oily and stale The first time I bought these I fell in love...
4 Rachel Humphrey 5.0 out of 5 stars Reviewed in the United States on February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i...
In [22]:
# comment_date 자르기

def name_cut(x):
    x = x.split('on ')[1]
    return x  

amazon["comment_date"] = amazon["comment_date"].apply(name_cut)
amazon.head()
Out[22]:
comment_author star_rating comment_date comment_title comment
0 Dmitry 1.0 out of 5 stars May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ...
1 S A 1.0 out of 5 stars September 17, 2018 Bug eggs in package?!! Thsee don't expire until December. I opened ...
2 Elisa S. 5.0 out of 5 stars December 17, 2018 #1 on my list The photo shows the seaweed brands I've trie...
3 Ani_Mon 1.0 out of 5 stars July 23, 2017 Oily and stale The first time I bought these I fell in love...
4 Rachel Humphrey 5.0 out of 5 stars February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i...
In [23]:
# 평점 자르기

def name_cut(x):
    x = x.split(" out")[0]
    x = float(x)
    return x  

amazon["star_rating"] = amazon["star_rating"].apply(name_cut)
amazon.head()
Out[23]:
comment_author star_rating comment_date comment_title comment
0 Dmitry 1.0 May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ...
1 S A 1.0 September 17, 2018 Bug eggs in package?!! Thsee don't expire until December. I opened ...
2 Elisa S. 5.0 December 17, 2018 #1 on my list The photo shows the seaweed brands I've trie...
3 Ani_Mon 1.0 July 23, 2017 Oily and stale The first time I bought these I fell in love...
4 Rachel Humphrey 5.0 February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i...
In [24]:
amazon.to_csv('amazon.csv')

- walmart

In [4]:
walmart = pd.read_csv('walmart.csv')  # 월마트 데이터
walmart.head()
Out[4]:
score title author date comment
0 5 Organic Seaweed Snacks Review Linda M., 14-Jan-20 These seaweed snacks that I have tasted were d...
1 5 I’m hooked! Cherlyn, 13-Mar-20 seen other people buying it. Thought I would t...
2 5 Organic Seaweed Snacks Review Chrystal A., 08-Mar-20 My son loves seaweed but I want to be sure it'...
3 4 Wasabi is best, but these come in 2nd Idk, 07-Jul-20 I haven't tried all the flavors yet but wasabi...
4 4 Organic Seaweed Snacks Review Melissa M., 27-Dec-19 Ocean's Halo seaweed snacks are great snacks t...

- youtube

In [6]:
youtube_1 = pd.read_excel('youtube_script1.xlsx')  # 유투브 데이터 1
youtube_1 = youtube_1.drop(youtube_1.columns[0], axis = 1)
youtube_1.head()
Out[6]:
text start duration
0 (Cheery music) 0.00 1.04
1 Hi everybody. Today I'm going to introduce you... 3.54 6.54
2 and delicious and traditional dish. 10.40 3.30
3 This is called gim-bugak. \nFried seaweed coat... 14.16 6.60
4 When this is fried, really really \ngorgeous a... 21.08 3.92
In [7]:
youtube_2 = pd.read_excel('youtube_script2.xlsx')  # 유투브 데이터 2
youtube_2 = youtube_2.drop(youtube_2.columns[0], axis = 1)
youtube_2.head()
Out[7]:
text start duration
0 so today about how they snack favorite 0.030 6.210
1 unboxing and this month snack fever 3.659 4.561
2 teamed up with any tongue if their 6.240 5.729
3 collaboration and let's just get right 8.220 12.149
4 into it already I see any tons products 11.969 11.511
In [8]:
# 유투브 전체 데이터

youtube = pd.concat([youtube_1['text'], youtube_2['text']], axis = 0) ; youtube
Out[8]:
0                                         (Cheery music)
1      Hi everybody. Today I'm going to introduce you...
2                    and delicious and traditional dish.
3      This is called gim-bugak. \nFried seaweed coat...
4      When this is fried, really really \ngorgeous a...
                             ...                        
126               now this month the website so here and
127                   in the link below I hope I totally
128                      recommend the go to John one so
129              delicious but I hope you guys are doing
130             well and I'll see you guys next time bye
Name: text, Length: 340, dtype: object
In [9]:
youtube = pd.DataFrame({'text': youtube})
youtube.head()
Out[9]:
text
0 (Cheery music)
1 Hi everybody. Today I'm going to introduce you...
2 and delicious and traditional dish.
3 This is called gim-bugak. \nFried seaweed coat...
4 When this is fried, really really \ngorgeous a...

- full data

In [10]:
# 전체 데이터

data = pd.concat([amazon['comment'], walmart['comment'], youtube_1['text'], youtube_2['text']], axis = 0) ; data
Out[10]:
0        I've been buying this product for a year, I ...
1        Thsee don't expire until December. I opened ...
2        The photo shows the seaweed brands I've trie...
3        The first time I bought these I fell in love...
4        I ate the entire box in a week. This stuff i...
                             ...                        
126               now this month the website so here and
127                   in the link below I hope I totally
128                      recommend the go to John one so
129              delicious but I hope you guys are doing
130             well and I'll see you guys next time bye
Length: 6119, dtype: object
In [702]:
data = pd.DataFrame({'comment': data})
data.head()
Out[702]:
comment
0 I've been buying this product for a year, I ...
1 Thsee don't expire until December. I opened ...
2 The photo shows the seaweed brands I've trie...
3 The first time I bought these I fell in love...
4 I ate the entire box in a week. This stuff i...
In [512]:
data.to_csv('full_data.csv')
In [513]:
len(data)
Out[513]:
6119

- amazon + walmart review data

In [11]:
# 아마존 + 월마트

review = pd.concat([amazon['comment'], walmart['comment']], axis = 0) ; review
Out[11]:
0         I've been buying this product for a year, I ...
1         Thsee don't expire until December. I opened ...
2         The photo shows the seaweed brands I've trie...
3         The first time I bought these I fell in love...
4         I ate the entire box in a week. This stuff i...
                              ...                        
1932    I love Asian food and this brand is the best a...
1933    I purchased the Chili Lime. The consistency wa...
1934    I tried Ocean's Halo seaweed snack sheets. I l...
1935    These are a great and healthy option for snack...
1936                                           delicious!
Name: comment, Length: 5779, dtype: object
In [12]:
score = pd.concat([amazon['star_rating'], walmart['score']], axis = 0) ; score
Out[12]:
0       1.0
1       1.0
2       5.0
3       1.0
4       5.0
       ... 
1932    5.0
1933    3.0
1934    5.0
1935    5.0
1936    5.0
Length: 5779, dtype: float64
In [13]:
author = pd.concat([amazon['comment_author'], walmart['author']], axis = 0) ; author
Out[13]:
0                Dmitry
1                   S A
2              Elisa S.
3               Ani_Mon
4       Rachel Humphrey
             ...       
1932          Kiara L.,
1933       Rosemary M.,
1934            deb F.,
1935         Joanna A.,
1936           Garrett,
Length: 5779, dtype: object
In [14]:
date = pd.concat([amazon['comment_date'], walmart['date']], axis = 0) ; date
Out[14]:
0              May 4, 2019
1       September 17, 2018
2        December 17, 2018
3            July 23, 2017
4         February 4, 2017
               ...        
1932             08-Jan-19
1933             20-Nov-18
1934             22-Jan-19
1935             30-Jan-19
1936             08-Aug-17
Length: 5779, dtype: object
In [15]:
title = pd.concat([amazon['comment_title'], walmart['title']], axis = 0) ; title
Out[15]:
0       This is very dangerous product ! Do not buy !
1                              Bug eggs in package?!!
2                                       #1 on my list
3                                      Oily and stale
4       The BEST seasoned seaweed makers in the WORLD
                            ...                      
1932                    Organic Seaweed Snacks Review
1933                    Organic Seaweed Snacks Review
1934                    Organic Seaweed Snacks Review
1935                    Organic Seaweed Snacks Review
1936                                       delicious!
Length: 5779, dtype: object
In [16]:
review = pd.DataFrame({'date': date, 'title' : title, 'review': review, 'score': score, 'author': author})
review.head(15)
Out[16]:
date title review score author
0 May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ... 1.0 Dmitry
1 September 17, 2018 Bug eggs in package?!! Thsee don't expire until December. I opened ... 1.0 S A
2 December 17, 2018 #1 on my list The photo shows the seaweed brands I've trie... 5.0 Elisa S.
3 July 23, 2017 Oily and stale The first time I bought these I fell in love... 1.0 Ani_Mon
4 February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i... 5.0 Rachel Humphrey
5 June 2, 2017 This batch smelled and tasted like gasoline. This batch smelled and tasted like gasoline.... 1.0 GP
6 October 18, 2016 A lot of seaweed for the price! I bought this seaweed as a snack food not fo... 5.0 Liam Theis
7 April 6, 2017 Good size snack and very healthy. As someone who has never tried seaweed befor... 4.0 okbye
8 January 13, 2020 Sanitation procedures I was okay with this product, the taste was ... 1.0 Donnie
9 December 2, 2017 Light On Salt and Texture I am a fan of seaweed snacks and have tried ... 4.0 benjamin bannister
10 January 17, 2020 Most expensive bag of chips ever. Delicious, but who would pay $16 for a bag o... 4.0 Margaret
11 March 31, 2020 A scam I love these chips but feel ripped off as I ... 1.0 Laura J Armstrong
12 April 20, 2020 "BULK" lol These are the best snacks ever..... However,... 1.0 Stacey
13 May 14, 2020 Misleading and unsatisfied COMPLETE RIP OFF and Misleading. It’s says b... 1.0 Theresa
14 July 8, 2020 Appetite suppressant This is the absolute nastiest thing I’ve eve... 1.0 Calvin Dean
In [17]:
review.to_csv('amazon_walmart_full_data.csv')
In [18]:
len(review)
Out[18]:
5779

Preprocessing

In [519]:
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup 
import re
import numpy as np

#text cleaning
stop_words = set(stopwords.words("english"))

stop_words.update(("love", "loves", "great", "good", "better", "amazing", "buying", "found", "get", "like", "loved", 'maybe', 'may', 'could', 'awesome', 
                   'definitely', 'perfect' , 'best', 'okay', 'excellent', 'disappointed', 'right', 'nice', 'however', 'maybe', 'pretty', 'thank', 'wonderful', 'terrible', 
                   'unfortunately', 'awful', 'horrible', 'worst', 'wont', 'surprised', 'one', 'bad', 'actually', 'really', 'would',
                    'everybody', 'sometime', 'state', 'generally', 'edge', 'report', 'gross'
))

def text_cleaner(text,num):
    newString=str(text).lower()
    newString = BeautifulSoup(newString, "lxml").text
    newString = re.sub(r'\([^)]*\)', '', newString)
    newString = re.sub('"','', newString)
    newString = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in newString.split(" ")])    
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 
    newString = re.sub('[m]{2,}', 'mm', newString)
    if(num==0):
        tokens = [w for w in newString.split() if not w in stop_words]
    else:
        tokens=newString.split()
    long_words=[]
    for i in tokens:
        if len(i)>1:   #removing short word
            long_words.append(i)   
    return (" ".join(long_words)).strip()
In [636]:
contraction_mapping = {"won't": "will not",  "ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                       "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                       "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                       "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                       "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                       "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                       "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                       "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                       "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                       "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                       "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                       "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                       "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                       "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                       "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                       "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                       "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                       "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                       "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                       "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                       "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                       "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                       "you're": "you are", "you've": "you have"}
In [637]:
cleaning_text = []
for r in review['review']:
    cleaning_text.append(text_cleaner(r,0))
In [638]:
review['cleaned_text']=cleaning_text
review['cleaned_text'].head()
Out[638]:
0    product year bought many times definetely give...
1    thsee expire december opened today looks bug e...
2    photo shows seaweed brands tried order prefere...
3    first time bought fell types seaweed snacks tr...
4    ate entire box week stuff delicious enjoy trad...
Name: cleaned_text, dtype: object
In [639]:
cleaning_text = []
for r in youtube['text']:
    cleaning_text.append(text_cleaner(r,0))
In [640]:
youtube['cleaned_text']=cleaning_text
youtube['cleaned_text'].head()
Out[640]:
0                                                    
1          hi today going introduce special beautiful
2                          delicious traditional dish
3    called gim bugak fried seaweed coated rice paste
4                            fried gorgeous beautiful
Name: cleaned_text, dtype: object
In [641]:
#visulazie the noumber of word

import matplotlib.pyplot as plt
#counting
n_of_text = []

for i in review['cleaned_text']:
    n_of_text.append(len(i.split()))
In [642]:
length = pd.DataFrame({'text' : n_of_text})
length.hist(bins = 80)
plt.rc('figure', figsize=(10, 5))
plt.show()

Visualization

In [643]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.countplot(data = review, x = 'score')
plt.rc('figure', figsize=(10, 5))
plt.show()
In [644]:
review.groupby("score").count()
Out[644]:
review author cleaned_text
score
1.0 596 597 597
2.0 299 299 299
3.0 482 482 482
4.0 934 934 934
5.0 3467 3466 3467
In [645]:
from nltk.probability import FreqDist
fdist = FreqDist(review['cleaned_text'])
print(fdist)

fdist.most_common(2)
fdist.plot(30,cumulative=False)
plt.show()
<FreqDist with 5197 samples and 5779 outcomes>
In [646]:
fdist = FreqDist(youtube['cleaned_text'])
print(fdist)

fdist.most_common(2)
fdist.plot(30,cumulative=False)
plt.show()
<FreqDist with 293 samples and 340 outcomes>
In [647]:
import nltk

plt.figure(figsize=(14,8))

myText = nltk.Text(review['cleaned_text'])

topics = ['snack', 'salty', 'kids', 'healthy snack', 'expensive']
myText.dispersion_plot(topics)
In [648]:
for topic in topics:   
    freqdist = nltk.FreqDist(review['cleaned_text'])
    print(topic,'more :', ' , '.join([ word.lower() for word, count in freqdist.most_common(5)]))
expensive more :  , delicious , taste , yummy , product
healthy snack more :  , delicious , taste , yummy , product
kids more :  , delicious , taste , yummy , product
salty more :  , delicious , taste , yummy , product
snack more :  , delicious , taste , yummy , product
In [649]:
freqdist.plot(10)
Out[649]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a38fb08b0>
In [650]:
tokenized_doc = review['cleaned_text'].apply(lambda x : x.split())
In [651]:
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize

tagged_list = []

for t in tokenized_doc:
    tagged_list.append(pos_tag(t))
In [652]:
pos_tag = pd.DataFrame({'tagged_list': tagged_list})
pos_tag.head()
Out[652]:
tagged_list
0 [(product, NN), (year, NN), (bought, VBD), (ma...
1 [(thsee, NN), (expire, NN), (december, NN), (o...
2 [(photo, NN), (shows, NNS), (seaweed, VBP), (b...
3 [(first, JJ), (time, NN), (bought, VBN), (fell...
4 [(ate, NN), (entire, JJ), (box, NN), (week, NN...
In [653]:
def count_tags(title_with_tags):
    tag_count = {}
    for word, tag in title_with_tags:
        if tag in tag_count:
            tag_count[tag] += 1
        else:
            tag_count[tag] = 1
    return(tag_count)

pos_tag['tagged_list'].map(count_tags).head()
Out[653]:
0    {'NN': 15, 'VBD': 4, 'JJ': 13, 'NNS': 2, 'RB':...
1    {'NN': 7, 'VBD': 1, 'VBZ': 1, 'NNS': 1, 'VBG':...
2    {'NN': 44, 'NNS': 6, 'VBP': 7, 'JJ': 21, 'RB':...
3    {'JJ': 5, 'NN': 10, 'VBN': 2, 'VBD': 2, 'NNS':...
4    {'NN': 19, 'JJ': 14, 'VBP': 6, 'FW': 1, 'NNS':...
Name: tagged_list, dtype: object
In [654]:
pos_tag['tag_counts'] = pos_tag['tagged_list'].map(count_tags)
pos_tag.head()
Out[654]:
tagged_list tag_counts
0 [(product, NN), (year, NN), (bought, VBD), (ma... {'NN': 15, 'VBD': 4, 'JJ': 13, 'NNS': 2, 'RB':...
1 [(thsee, NN), (expire, NN), (december, NN), (o... {'NN': 7, 'VBD': 1, 'VBZ': 1, 'NNS': 1, 'VBG':...
2 [(photo, NN), (shows, NNS), (seaweed, VBP), (b... {'NN': 44, 'NNS': 6, 'VBP': 7, 'JJ': 21, 'RB':...
3 [(first, JJ), (time, NN), (bought, VBN), (fell... {'JJ': 5, 'NN': 10, 'VBN': 2, 'VBD': 2, 'NNS':...
4 [(ate, NN), (entire, JJ), (box, NN), (week, NN... {'NN': 19, 'JJ': 14, 'VBP': 6, 'FW': 1, 'NNS':...
In [655]:
tag_set = list(set([tag for tags in pos_tag['tag_counts'] for tag in tags]))
for tag in tag_set:
    pos_tag[tag] = pos_tag['tag_counts'].map(lambda x: x.get(tag, 0))
title = 'Frequency of POS Tags in Review'    
pos_tag[tag_set].sum().sort_values().plot(kind='barh', logx=True, figsize=(12,10), title=title)
Out[655]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a38ff0a30>
In [656]:
vocabulary = {}
for row in pos_tag['tagged_list']:
    for word, tag in row:
        if word in vocabulary:
            if tag in vocabulary[word]:
                vocabulary[word][tag] += 1
            else:
                vocabulary[word][tag] = 1
        else:
            vocabulary[word] = {tag: 1}
vocabulary_df = pd.DataFrame.from_dict(vocabulary, orient='index')
vocabulary_df.fillna(value=0, inplace=True)
tag = 'NNP' # NNP: Proper noun, singular 
vocabulary_df.sort_values(by=tag, ascending=False).head(10)
Out[656]:
NN VBD VBN RB JJ NNS VBP VB CD VBG ... RBS DT FW NNP WDT UH PRP WRB WP POS
kick 28.0 0.0 0.0 0.0 14.0 1.0 5.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 4.0 0.0 0.0 0.0 0.0 0.0 0.0
know 3.0 0.0 0.0 0.0 20.0 7.0 112.0 33.0 0.0 0.0 ... 0.0 0.0 0.0 3.0 0.0 0.0 0.0 0.0 0.0 0.0
kfordhealth 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
kirkland 8.0 0.0 0.0 0.0 1.0 0.0 0.0 2.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
yummy 115.0 0.0 0.0 21.0 30.0 14.0 14.0 4.0 6.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 4.0 0.0 0.0 0.0
tell 12.0 0.0 0.0 0.0 1.0 0.0 14.0 21.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
kaki 2.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
mart 2.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
quality 94.0 0.0 0.0 0.0 7.0 0.0 4.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0
yolk 3.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0

10 rows × 28 columns

In [657]:
size = 25
tag = 'VBG' # VBG: Verb, gerund or present participle
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[657]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a55c572b0>
In [658]:
size = 25
tag = 'VBD' 
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[658]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a55b93760>
In [659]:
size = 25
tag = 'NN'
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[659]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6b11a7feb0>
In [660]:
size = 25
tag = 'NNS' 
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[660]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a78b43310>
In [661]:
size = 25
tag = 'RB' 
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[661]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a73b88eb0>
In [662]:
size = 25
tag = 'RBR' 
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[662]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a55c3f520>
In [663]:
size = 25
tag = 'JJ'
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[663]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a6f3daa00>
In [664]:
size = 25
tag = 'JJR' 
title = 'Top {} Most Frequent Words for {} Tag'.format(size, tag)
vocabulary_df[tag].sort_values().tail(size).plot(kind='barh', figsize=(12,10), title=title)
Out[664]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a6d6c5e50>

word2vec: calculate similarity

In [665]:
# word2vec model

import multiprocessing

# 문자 벡터 차원 수
num_features = 300 
# 최소 문자 수
min_word_count = 10 
# 병렬 처리 스레드 수
num_workers = multiprocessing.cpu_count()
# 문자열 창 크기
context_size = 5 
# seed 값
seed = 1

from gensim.models import word2vec

word2vec_model = word2vec.Word2Vec(tokenized_doc, 
                          seed = seed,
                          workers = num_workers, 
                          size = num_features, 
                          min_count = min_word_count,
                          window = context_size) 
In [666]:
print("word2vec vocabulary length: ", len(word2vec_model.wv.vocab))
word2vec vocabulary length:  1096
In [667]:
# word2vec matrix

from __future__ import absolute_import, division, print_function
import numpy as np

count = 1000
word_vectors_matrix = np.ndarray(shape=(count, 300), dtype='float64')
word_list = []
i = 0
for word in word2vec_model.wv.vocab:
    word_vectors_matrix[i] = word2vec_model.wv[word]
    word_list.append(word)
    i = i+1
    if i == count:
        break
print("word_vectors_matrix shape is ", word_vectors_matrix.shape)
word_vectors_matrix shape is  (1000, 300)
In [668]:
# 2d로 차원 축소

import sklearn.manifold

tsne = sklearn.manifold.TSNE(n_components=2, init = 'pca', n_iter = 3500, random_state=0)
word_vectors_matrix_2d = tsne.fit_transform(word_vectors_matrix)
print("word_vectors_matrix_2d shape is ", word_vectors_matrix_2d.shape)
word_vectors_matrix_2d shape is  (1000, 2)
In [669]:
points = pd.DataFrame(
            [(word, coords[0], coords[1])
            for word, coords in [
                (word, word_vectors_matrix_2d[word_list.index(word)])
                for word in word_list
            ]],
            columns = ['word', 'x', 'y'])
In [670]:
points.head(10)
Out[670]:
word x y
0 product 67.705551 -6.838850
1 year 59.715160 -0.957874
2 bought 62.915512 -5.355459
3 many 66.511620 -9.770054
4 times 58.729416 -1.527508
5 give 65.033379 -6.411435
6 start -4.184552 -6.568685
7 today 2.309242 11.827373
8 enjoying -61.577816 1.059802
9 something 60.241947 4.795998
In [671]:
sns.set_context('poster')
sns.scatterplot("x", "y", s=10, data = points)
Out[671]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f6a55b73670>
In [672]:
def plot_region(x_bounds, y_bounds):
    slice = points[
        (x_bounds[0] <= points['x']) &
        (points['x'] <= x_bounds[1]) &
        (y_bounds[0] <= points['y']) &
        (points['y'] <= y_bounds[1])
    ]
    ax = slice.plot.scatter('x', 'y', s=35, figsize=(30, 28))
    for i, point in slice.iterrows():
        ax.text(point['x'] + 1, point['y'] + 1, point['word'], fontsize=11)
In [673]:
plot_region(x_bounds=(-100, 75), y_bounds=(-20, 20))
In [674]:
plot_region(x_bounds=(-100, -25), y_bounds=(-20, 20))
In [675]:
from sklearn.manifold import TSNE

def display_closestwords_tsnescatterplot(model, word):
    
    arr = np.empty((0,300), dtype='f')
    word_labels = [word]

    # get close words
    close_words = model.wv.similar_by_word(word)
    
    # add the vector for each of the closest words to the array
    arr = np.append(arr, np.array([model.wv[word]]), axis=0)
    for wrd_score in close_words:
        wrd_vector = model.wv[wrd_score[0]]
        word_labels.append(wrd_score[0])
        arr = np.append(arr, np.array([wrd_vector]), axis=0)
        
    # find tsne coords for 2 dimensions
    tsne = TSNE(n_components=2, random_state=0)
    np.set_printoptions(suppress=True)
    Y = tsne.fit_transform(arr)

    x_coords = Y[:, 0]
    y_coords = Y[:, 1]
    # display scatter plot
    plt.scatter(x_coords, y_coords)

    for label, x, y in zip(word_labels, x_coords, y_coords):
        plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')
    plt.xlim(x_coords.min()+0.005, x_coords.max()+0.005)
    plt.ylim(y_coords.min()+0.005, y_coords.max()+0.005)
    plt.rc('figure', figsize=(10, 8))
    plt.show()
In [676]:
display_closestwords_tsnescatterplot(word2vec_model, 'snack')
In [677]:
word2vec_model.wv.most_similar("snack")
Out[677]:
[('light', 0.9969058036804199),
 ('delicious', 0.9968429207801819),
 ('crispy', 0.9967607259750366),
 ('texture', 0.9966163039207458),
 ('salty', 0.9964290261268616),
 ('healthy', 0.9962316155433655),
 ('crunchy', 0.9959913492202759),
 ('favorite', 0.9958543181419373),
 ('enjoyed', 0.995800256729126),
 ('sheets', 0.9955472946166992)]
In [678]:
display_closestwords_tsnescatterplot(word2vec_model, 'healthy')
In [679]:
word2vec_model.wv.most_similar("healthy")
Out[679]:
[('light', 0.9993699193000793),
 ('crunch', 0.9992156028747559),
 ('something', 0.9992151856422424),
 ('kids', 0.9991801977157593),
 ('flavorful', 0.9991520643234253),
 ('crisp', 0.9991462826728821),
 ('craving', 0.999143123626709),
 ('enjoy', 0.9991207122802734),
 ('delicious', 0.9991083145141602),
 ('sushi', 0.9990869164466858)]
In [680]:
display_closestwords_tsnescatterplot(word2vec_model, 'calorie')
In [681]:
word2vec_model.wv.most_similar("calorie")
Out[681]:
[('carb', 0.9998454451560974),
 ('calories', 0.999436616897583),
 ('alternative', 0.9991732239723206),
 ('cal', 0.9989274740219116),
 ('craving', 0.9987910985946655),
 ('tasty', 0.9987858533859253),
 ('chip', 0.9986873865127563),
 ('eating', 0.9986596703529358),
 ('without', 0.9986557960510254),
 ('easy', 0.998652994632721)]
In [682]:
word2vec_model.wv.most_similar("family")
Out[682]:
[('already', 0.9997348785400391),
 ('told', 0.9997129440307617),
 ('choose', 0.9996840953826904),
 ('purchase', 0.9996676445007324),
 ('glad', 0.9996676445007324),
 ('tell', 0.9996556639671326),
 ('honestly', 0.9996547698974609),
 ('interesting', 0.9996476769447327),
 ('unique', 0.9996358752250671),
 ('experience', 0.9996302723884583)]
In [683]:
display_closestwords_tsnescatterplot(word2vec_model, 'seasoning')
In [684]:
word2vec_model.wv.most_similar("seasoning")
Out[684]:
[('sheet', 0.9999055862426758),
 ('also', 0.9998992085456848),
 ('teriyaki', 0.9998957514762878),
 ('tasting', 0.9998893141746521),
 ('anything', 0.9998866319656372),
 ('husband', 0.999886155128479),
 ('think', 0.9998830556869507),
 ('strong', 0.9998790621757507),
 ('kick', 0.9998762011528015),
 ('always', 0.9998735785484314)]
In [685]:
word2vec_model.wv.most_similar("sauce")
Out[685]:
[('soy', 0.9998873472213745),
 ('everything', 0.9998795986175537),
 ('hot', 0.9998794198036194),
 ('top', 0.9998793601989746),
 ('due', 0.9998748302459717),
 ('eaten', 0.9998725056648254),
 ('big', 0.999870240688324),
 ('extra', 0.9998698830604553),
 ('since', 0.9998689293861389),
 ('guess', 0.9998688101768494)]
In [686]:
word2vec_model.wv.most_similar("bland")
Out[686]:
[('type', 0.9999234676361084),
 ('using', 0.9999231696128845),
 ('wanted', 0.9999224543571472),
 ('friend', 0.9999176859855652),
 ('come', 0.9999163746833801),
 ('must', 0.9999098777770996),
 ('care', 0.9999065399169922),
 ('believe', 0.9999063014984131),
 ('might', 0.9999029636383057),
 ('everyone', 0.9999017715454102)]

T-SNE

In [282]:
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import seaborn as sns

df =  review[review['score'] != 3]

df['sentiment'] = df['score'].apply(lambda score: 'positive' if score >3 else 'negative')
df.head()
Out[282]:
review score author cleaned_text sentiment
0 I've been buying this product for a year, I ... 1.0 Dmitry product year bought many times definetely give... negative
1 Thsee don't expire until December. I opened ... 1.0 S A thsee expire december opened today looks bug e... negative
2 The photo shows the seaweed brands I've trie... 5.0 Elisa S. photo shows seaweed brands tried order prefere... positive
3 The first time I bought these I fell in love... 1.0 Ani_Mon first time bought fell types seaweed snacks tr... negative
4 I ate the entire box in a week. This stuff i... 5.0 Rachel Humphrey ate entire box week stuff delicious enjoy trad... positive
In [283]:
display(df['score'].value_counts())
5.0    3466
4.0     934
1.0     596
2.0     299
Name: score, dtype: int64
In [284]:
display(df['sentiment'].value_counts())
positive    4400
negative     895
Name: sentiment, dtype: int64
In [285]:
positive_points = df[df['sentiment'] == 'positive'].sample(n=500)
negative_points = df[df['sentiment'] == 'negative'].sample(n=500)

# Concatenating both of above
total_points = pd.concat([positive_points, negative_points])

count_vect = CountVectorizer(ngram_range=(1,1))
In [286]:
# Initializing vectorizer for bigram
count_vect = CountVectorizer(ngram_range=(1,1))

# Initializing standard scaler
std_scaler = StandardScaler(with_mean=False)

# Creating count vectors and converting into dense representation
sample_points = total_points['cleaned_text']
sample_points = count_vect.fit_transform(sample_points)
sample_points = std_scaler.fit_transform(sample_points)
sample_points = sample_points.todense()

# Storing class label in variable
labels = total_points['sentiment']

# Getting shape
print(sample_points.shape, labels.shape)
(1000, 2782) (1000,)
In [306]:
# TSNE for Word2vec

tsne_data = sample_points
tsne_labels = labels

# Initializing with most explained variance
model = TSNE(n_components=2, random_state=15, perplexity=20, n_iter=2000)

# Fitting model
tsne_data = model.fit_transform(tsne_data)

# Adding labels to the data point
tsne_data = np.vstack((tsne_data.T, tsne_labels)).T

# Creating data frame
tsne_df = pd.DataFrame(data=tsne_data, columns=('Dim_1', 'Dim_2', 'label'))

# Plotting graph for class labels
sns.FacetGrid(tsne_df, hue='label', size=5).map(plt.scatter, 'Dim_1', 'Dim_2').add_legend()
plt.title("TSNE with default parameters")
plt.xlabel("Dim_1")
plt.ylabel("Dim_2")
plt.rc('figure', figsize=(12, 15))
plt.show()

Wordcloud of low-scored & high-scored words using TF-IDF

In [289]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import roc_curve, auc
from sklearn import metrics

review['sentiment'] = review['score'].apply(lambda score: 'positive' if score >3 else 'negative')
review.head()
Out[289]:
review score author cleaned_text sentiment
0 I've been buying this product for a year, I ... 1.0 Dmitry product year bought many times definetely give... negative
1 Thsee don't expire until December. I opened ... 1.0 S A thsee expire december opened today looks bug e... negative
2 The photo shows the seaweed brands I've trie... 5.0 Elisa S. photo shows seaweed brands tried order prefere... positive
3 The first time I bought these I fell in love... 1.0 Ani_Mon first time bought fell types seaweed snacks tr... negative
4 I ate the entire box in a week. This stuff i... 5.0 Rachel Humphrey ate entire box week stuff delicious enjoy trad... positive
In [291]:
positive = review[review['sentiment'] == 'positive']
positive.head()
Out[291]:
review score author cleaned_text sentiment
2 The photo shows the seaweed brands I've trie... 5.0 Elisa S. photo shows seaweed brands tried order prefere... positive
4 I ate the entire box in a week. This stuff i... 5.0 Rachel Humphrey ate entire box week stuff delicious enjoy trad... positive
6 I bought this seaweed as a snack food not fo... 5.0 Liam Theis bought seaweed snack food sushi cannot answer ... positive
7 As someone who has never tried seaweed befor... 4.0 okbye someone never tried seaweed took minute used t... positive
9 I am a fan of seaweed snacks and have tried ... 4.0 benjamin bannister fan seaweed snacks tried many package terms ov... positive
In [292]:
negative = review[review['sentiment'] == 'positive']
negative.head()
Out[292]:
review score author cleaned_text sentiment
2 The photo shows the seaweed brands I've trie... 5.0 Elisa S. photo shows seaweed brands tried order prefere... positive
4 I ate the entire box in a week. This stuff i... 5.0 Rachel Humphrey ate entire box week stuff delicious enjoy trad... positive
6 I bought this seaweed as a snack food not fo... 5.0 Liam Theis bought seaweed snack food sushi cannot answer ... positive
7 As someone who has never tried seaweed befor... 4.0 okbye someone never tried seaweed took minute used t... positive
9 I am a fan of seaweed snacks and have tried ... 4.0 benjamin bannister fan seaweed snacks tried many package terms ov... positive
In [293]:
positive.to_csv('postive.csv')
negative.to_csv('negative.csv')
In [294]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

stopwords = set(STOPWORDS) 
stopwords.add('like')
stopwords.add('product')
stopwords.add('bought')
stopwords.add('okay')
stopwords.add('buy')
stopwords.add('ok')
stopwords.add('love')
stopwords.add('enjoy')
stopwords.add('buying')
stopwords.add('better')
stopwords.add('great')
stopwords.add('good')

def show_wordcloud(data, title = None):
    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords,
        max_words=200,
        max_font_size=40, 
        scale=3,
        collocations=False,
        random_state=1 # chosen at random by flipping a coin; it was heads
    ).generate(str(data))
    
    fig = plt.figure(1, figsize=(8, 8))
    plt.axis('off')
    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()
In [295]:
show_wordcloud(review[review.score == 5]["review"], title = "High scored words")
In [296]:
show_wordcloud(review[review.score == 1]["review"], title = "Low scored words")

LDA: 토픽 모델링

In [297]:
from gensim import corpora 
from gensim import models
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS

dictionary = corpora.Dictionary(tokenized_doc) 

corpus = [dictionary.doc2bow(text) for text in tokenized_doc]  # bag of words (단어가 있나 - 1, 없나 - 0)
lda = models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, # 벡터화한 단어와 단어 사전 압력
        num_topics=2, random_state = 1) 

for t in lda.show_topics():
    print(t)
(0, '0.026*"flavor" + 0.021*"seaweed" + 0.016*"snacks" + 0.013*"try" + 0.011*"snack" + 0.010*"tried" + 0.007*"recommend" + 0.006*"flavors" + 0.006*"product" + 0.006*"taste"')
(1, '0.031*"seaweed" + 0.030*"snack" + 0.020*"taste" + 0.018*"snacks" + 0.017*"flavor" + 0.014*"try" + 0.013*"salt" + 0.013*"sea" + 0.012*"product" + 0.011*"healthy"')
In [298]:
wc = WordCloud(background_color='black')

plt.figure(figsize=(30,30))
for t in range(lda.num_topics):
    plt.subplot(5,4,t+1)
    x = dict(lda.show_topic(t,200))
    im = wc.generate_from_frequencies(x)
    plt.imshow(im)
    plt.axis("off")
    plt.title("Topic #" + str(t))
    
plt.show()
In [299]:
import pyLDAvis.gensim

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda, corpus, dictionary)
pyLDAvis.display(vis)
Out[299]:

Sentiment analysis: logistic regression

In [579]:
#drop empty rows
review.replace(' ',np.nan , inplace = True) #filling the space with nan value and then remove it
review.dropna(axis = 0 , inplace = True)

# df =  review[review['score'] != 3]
df = review
X = df['cleaned_text']
y_dict = {1.0:0, 2.0:0, 3.0:1, 4.0:1, 5.0:1}
y = df['score'].map(y_dict)
In [580]:
import seaborn as sns

sns.countplot(y)
plt.rc('figure', figsize=(5, 2))
plt.show()
In [581]:
# logistic regression

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.dummy import DummyClassifier
import warnings
 
warnings.filterwarnings("ignore")

c = CountVectorizer(stop_words = 'english')

def text_fit(X, y, model,clf_model,coef_show=1):
    
    X_c = model.fit_transform(X)
    print('# features: {}'.format(X_c.shape[1]))
    X_train, X_test, y_train, y_test = train_test_split(X_c, y, random_state=0)
    print('# train records: {}'.format(X_train.shape[0]))
    print('# test records: {}'.format(X_test.shape[0]))
    clf = clf_model.fit(X_train, y_train)
    acc = clf.score(X_test, y_test)
    print ('Model Accuracy: {}'.format(acc))
    
    if coef_show == 1: 
        w = model.get_feature_names()
        coef = clf.coef_.tolist()[0]
        coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
        coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
        print('')
        print('-Top 20 positive-')
        print(coeff_df.head(20).to_string(index=False))
        print('')
        print('-Top 20 negative-')        
        print(coeff_df.tail(20).to_string(index=False))
    
    
text_fit(X, y, c, LogisticRegression())
# features: 5936
# train records: 4332
# test records: 1445
Model Accuracy: 0.8740484429065744

-Top 20 positive-
      Word  Coefficient
     yummy     1.618764
 delicious     1.616958
      fast     1.416544
    hooked     1.335646
  calories     1.322125
  favorite     1.300244
     tasty     1.292989
  addicted     1.261317
    packed     1.255285
      wait     1.213037
 addictive     1.180017
       bbq     1.145611
      easy     1.125680
       yum     1.124933
  daughter     1.112055
      come     1.105126
       son     1.063495
  snacking     1.056147
      stop     1.046596
      lime     1.040983

-Top 20 negative-
          Word  Coefficient
         bland    -1.117750
         stuck    -1.141845
         threw    -1.158443
    apparently    -1.205280
       ordered    -1.206752
          yuck    -1.212022
           box    -1.216351
           fan    -1.237366
      received    -1.280953
 disappointing    -1.301283
        opened    -1.312999
         nasty    -1.320738
     cardboard    -1.386859
          fish    -1.389503
    flavorless    -1.433581
        crumbs    -1.454934
         stale    -1.462249
         paper    -1.579028
         sorry    -1.816366
    disgusting    -2.261907
In [582]:
text_fit(X, y, c, DummyClassifier(),0)
# features: 5936
# train records: 4332
# test records: 1445
Model Accuracy: 0.7280276816608997
In [583]:
tfidf_n = TfidfVectorizer(ngram_range=(1,2),stop_words = stop_words)
text_fit(X, y, tfidf_n, LogisticRegression())
# features: 54535
# train records: 4332
# test records: 1445
Model Accuracy: 0.829757785467128

-Top 20 positive-
      Word  Coefficient
     snack     3.192491
 delicious     2.890057
     tasty     2.249734
     yummy     1.925861
   flavors     1.918146
  favorite     1.654505
      kids     1.645281
   healthy     1.617553
    snacks     1.440905
    little     1.425731
     price     1.295359
  calories     1.262498
      easy     1.145646
   crunchy     1.114523
       yum     1.068801
     light     1.055100
  daughter     1.045403
   seaweed     1.032171
 addictive     1.009821
     spicy     0.997669

-Top 20 negative-
       Word  Coefficient
      smell    -1.372884
       date    -1.384419
      sorry    -1.402299
     return    -1.445380
  cardboard    -1.447297
        fan    -1.451646
      fishy    -1.573772
      threw    -1.610983
     opened    -1.643096
      salty    -1.705088
      nasty    -1.708335
      money    -1.776613
      boxes    -1.798282
     tasted    -1.975024
   received    -2.000370
 disgusting    -2.098168
    ordered    -2.151926
       fish    -2.303446
        box    -2.907102
      stale    -3.161736

Sentiment analysis: logistic regression vs. Naive Bayes

In [604]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split

review['sentiment'] = review['score'].apply(lambda score: 'positive' if score >= 3 else 'negative')

train, test = train_test_split(review, test_size=0.2)

countVector = CountVectorizer(min_df = 1, ngram_range = (1, 4))
X_train_counts = countVector.fit_transform(train["cleaned_text"])

#applying tfidf to term frequency
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

X_new_counts = countVector.transform(test["cleaned_text"])
X_test_tfidf = tfidf_transformer.transform(X_new_counts)

y_train = train["sentiment"]
y_test = test["sentiment"]

prediction = dict()
In [605]:
# Multinomial Naïve Bayes# Naïve Bayes

model = MultinomialNB().fit(X_train_tfidf, y_train)
prediction['Multinomial'] = model.predict(X_test_tfidf)
In [606]:
# Bernoulli Naïve Bayes 

model = BernoulliNB().fit(X_train_tfidf, y_train)
prediction['Bernoulli'] = model.predict(X_test_tfidf)
In [607]:
# Logistic regression

logreg = LogisticRegression(C=1e5)
logreg_result = logreg.fit(X_train_tfidf, y_train)
prediction['Logistic'] = logreg.predict(X_test_tfidf)
In [608]:
def formatt(x):
    if x == 'negative':
        return 0
    return 1
vfunc = np.vectorize(formatt)

cmp = 0
colors = ['b', 'g', 'y', 'm', 'k']
for model, predicted in prediction.items():
    false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test.map(formatt), vfunc(predicted))
    roc_auc = auc(false_positive_rate, true_positive_rate)
    plt.plot(false_positive_rate, true_positive_rate, colors[cmp], label='%s: AUC %0.2f'% (model,roc_auc))
    cmp += 1

plt.title('Classifiers comparaison with ROC')
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([-0.1,1.2])
plt.ylim([-0.1,1.2])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.rc('figure', figsize=(20, 12))
plt.show()
In [609]:
print(metrics.classification_report(y_test, prediction['Logistic'], target_names = ["positive", "negative"]))
              precision    recall  f1-score   support

    positive       0.79      0.51      0.62       185
    negative       0.91      0.97      0.94       971

    accuracy                           0.90      1156
   macro avg       0.85      0.74      0.78      1156
weighted avg       0.89      0.90      0.89      1156

In [610]:
accuracy_score(y_test, prediction['Logistic'])
Out[610]:
0.9005190311418685
In [611]:
features = countVector.get_feature_names()
feature_coefs = pd.DataFrame(
    data = list(zip(features, logreg_result.coef_[0])),
    columns = ['feature', 'coefficient'])

feature_coefs.sort_values(by='coefficient')[:30]
Out[611]:
feature coefficient
128607 tasting snack -53.627036
127709 tasted burned -47.676470
148040 yuck -45.564724
12226 box -37.899573
31288 disgusting -37.720543
76047 nasty -36.252600
96017 received -35.212989
12575 boxes -31.565544
132972 threw -29.927223
120292 stale -28.502065
43861 fishy -28.231357
118782 sorry -27.162778
81347 opened -26.938524
43582 fish -26.509283
39197 expired -25.164672
127784 tasted even -25.159911
25865 crumbs -24.692759
27281 date -23.463928
50657 garbage -22.770953
35882 enjoyed things -22.531105
98107 return -21.852363
12258 box came -21.232014
7232 away -21.169321
112171 smell -20.797856
91758 product expected -20.652902
76928 never -20.347423
103429 says -19.866711
17156 cardboard -19.825669
82090 ordered -19.657242
44679 flavor buy -19.308840

keyword analysis using network model

In [612]:
import networkx as nx
import nltk

tagged_sents = [nltk.pos_tag(sentence) for sentence in tokenized_doc]
In [613]:
import re
for token, tag in tagged_sents[0]:
    if re.match(r'NN*|JJ*', tag):
        print (token, tag)
product NN
year NN
many JJ
times NNS
start NN
today NN
something NN
sharp JJ
stack JJ
deep JJ
tongue JJ
piece NN
thin NN
inch JJ
length NN
seaweed JJ
sheet NN
fish JJ
bone NN
ut JJ
wire NN
death NN
happen JJ
thing NN
dangerous JJ
image NN
give JJ
kids NNS
dangerous JJ
product NN
In [614]:
noun_phrases = [[token for token, tag in sent if re.match(r'NN*|JJ*', tag)] 
                for sent in tagged_sents]
In [615]:
import itertools as it
edgelist = [edge for phrase in noun_phrases for edge in it.combinations(phrase, 2)]
In [616]:
G = nx.Graph(edgelist)
index = nx.betweenness_centrality(G)

sorted_index = sorted(index.items(), key=lambda x:x[1], reverse=True)

# Top 10 noun phrases by betweenness centrality:
for word, centr in sorted_index[:10]:
        print (word, centr)
taste 0.06386158539190173
snack 0.05929117673139671
flavor 0.04883720265858079
seaweed 0.04282928094228625
snacks 0.0422076130930521
product 0.03597867746612883
delicious 0.021505753661634347
eat 0.020739311398279364
little 0.01846788798825126
healthy 0.01656901289275728
In [617]:
G.size()
Out[617]:
210961
In [621]:
%pylab inline
%config InlineBackend.figure_format = 'png'
plt.rc('figure', figsize=(30, 25))
G.remove_nodes_from([n for n in index if index[n] == .0])
node_size = [index[n]*10000 for n in G]
pos = nx.spring_layout(G)
nx.draw_networkx(G, pos, node_size=node_size, node_color='#A0CBE2', edge_color='white', alpha=.5, linewidths=15)
Populating the interactive namespace from numpy and matplotlib

Sentiment anlysis of 5 classes

In [323]:
df = pd.read_csv('amazon_comment_bugak_result.csv')
df = df.loc[:,['comment', 'star_rating']]
df['star_rating'] = df["star_rating"].apply(name_cut)
df.head()
Out[323]:
comment star_rating
0 Your label here on Amazon does not State any... 1.0
1 If I could give this zero stars, I would. To... 1.0
2 The taste was not bad, but i gave it 2 stars... 2.0
3 If I knew what the disclaimer was on the sid... 1.0
4 Like an idiot I looked at the reviews, but d... 1.0
In [324]:
sns.countplot(data = df, x = 'star_rating')
plt.rc('figure', figsize=(12, 7))
plt.show()
In [325]:
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag

lmtzr = WordNetLemmatizer()
negation = re.compile(r"(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)$)|n't",re.I)
clp = re.compile(r"^[.:;!?]$",re.I)

def extract_words_from_comments(df):
    comments_tok = []
    for index, datapoint in df.iterrows():
        tokenized_words = word_tokenize(datapoint["comment"].lower(),language='english')
        pos_tagged_words = pos_tag(tokenized_words)
        tokenized_words = ["_".join([lmtzr.lemmatize(i[0]),i[1]]) for i in pos_tagged_words if (i[0] not in stopwords.words("english") and len(i[0]) > 2)]
        comments_tok.append(tokenized_words)
    df["comment_tok"] = comments_tok
    return df

df = extract_words_from_comments(df)
print (df.head())
print (df.shape)
                                             comment  star_rating  \
0    Your label here on Amazon does not State any...          1.0   
1    If I could give this zero stars, I would. To...          1.0   
2    The taste was not bad, but i gave it 2 stars...          2.0   
3    If I knew what the disclaimer was on the sid...          1.0   
4    Like an idiot I looked at the reviews, but d...          1.0   

                                         comment_tok  
0  [label_NN, amazon_NN, state_NN, anything_NN, l...  
1  [could_MD, give_VB, zero_CD, star_NNS, would_M...  
2  [taste_NN, bad_JJ, gave_VBD, star_NNS, warning...  
3  [knew_VBD, disclaimer_NN, side_NN, bottle_NN, ...  
4  [like_IN, idiot_NN, looked_VBD, review_NNS, sw...  
(596, 3)
In [326]:
from gensim import matutils, corpora, models

def vectorize_comments(df):
    d = corpora.Dictionary(df['comment_tok'])
    d.filter_extremes(no_below=2, no_above=0.8)
    d.compactify()
    corpus = [d.doc2bow(text) for text in df['comment_tok']]
    corpus = matutils.corpus2csc(corpus, num_terms = len(d.token2id))
    corpus = corpus.transpose()
    return d, corpus

dictionary, corpus = vectorize_comments(df)
print(corpus.shape)
(596, 853)
In [327]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier as RFC
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import pickle

def train_classifier(X,y):
    n_estimators = [100]
    min_samples_split = [2]
    min_samples_leaf = [1]
    bootstrap = [True]

    parameters = {'n_estimators': n_estimators, 'min_samples_leaf': min_samples_leaf,
                  'min_samples_split': min_samples_split}

    clf = GridSearchCV(RFC(verbose=1,n_jobs=4), cv=4, param_grid=parameters)
    clf.fit(X, y)
    return clf

X_train, X_test, y_train, y_test = train_test_split(corpus, df["star_rating"], random_state=0)
classifier = train_classifier(X_train,y_train)
print (classifier.best_score_, "----------------Best Accuracy score on Cross Validation Sets")
print (classifier.score(X_test,y_test))
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.6s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
0.7494168275418275 ----------------Best Accuracy score on Cross Validation Sets
0.7785234899328859
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished
In [328]:
f = open("Output.txt","w")
f.write("Best Accuracy score on Cross Validation Sets %f" %classifier.best_score_,)
f.write("Score on Test Set %f" %classifier.score(X_test,y_test))
f.close()
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished

gender identification by name using gender-guesser

In [329]:
import gender_guesser.detector as gender

g = gender.Detector()

print(g.get_gender(u"Bob"))
male
In [330]:
review['author']
Out[330]:
0                Dmitry
1                   S A
2              Elisa S.
3               Ani_Mon
4       Rachel Humphrey
             ...       
1932          Kiara L.,
1933       Rosemary M.,
1934            deb F.,
1935         Joanna A.,
1936           Garrett,
Name: author, Length: 5777, dtype: object
In [333]:
gender = []

for a in review['author']:
    gender.append(g.get_gender(a))  
In [334]:
while 'unknown' in gender:
    gender.remove('unknown')
In [340]:
sns.countplot(gender)
plt.rc('figure', figsize=(14, 10))
plt.show()

gender identification by name using gender-predictor

In [341]:
!pip install git+git://github.com/clintval/gender_predictor.git
Looking in indexes: http://ftp.daumkakao.com/pypi/simple
Collecting git+git://github.com/clintval/gender_predictor.git
  Cloning git://github.com/clintval/gender_predictor.git to /tmp/pip-req-build-7emdr6kt
Requirement already satisfied (use --upgrade to upgrade): gender-predictor==0.1 from git+git://github.com/clintval/gender_predictor.git in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages
Requirement already satisfied: nltk in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages (from gender-predictor==0.1) (3.5)
Requirement already satisfied: click in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages (from nltk->gender-predictor==0.1) (7.1.2)
Requirement already satisfied: joblib in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages (from nltk->gender-predictor==0.1) (0.14.1)
Requirement already satisfied: tqdm in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages (from nltk->gender-predictor==0.1) (4.46.1)
Requirement already satisfied: regex in /home/jeehyun.lee.conda/anaconda3/envs/kubig-venv/lib/python3.8/site-packages (from nltk->gender-predictor==0.1) (2020.7.14)
Building wheels for collected packages: gender-predictor
  Building wheel for gender-predictor (setup.py) ... done
  Created wheel for gender-predictor: filename=gender_predictor-0.1-py3-none-any.whl size=3299 sha256=2d04188af83903420ffaceba01b7d4d8f0e9804faf5b570e1c273599e3b4a5ca
  Stored in directory: /tmp/pip-ephem-wheel-cache-_yxl7qjm/wheels/30/a9/a7/26dff4bb13fd58d960daad6e626d2605d7a813ccad8c2745b5
Successfully built gender-predictor
In [342]:
from gender_predictor import GenderPredictor

gp = GenderPredictor()
gp.train_and_test()
gp.classify('Aldo')
import complete
32,031 male names
56,347 female names
classifier accuracy: 96.88%
Out[342]:
'M'
In [343]:
gender = []

for ca in review['author']:
    gender.append(gp.classify(ca))  
In [344]:
sns.countplot(gender)
plt.rc('figure', figsize=(12, 7))
plt.show()

Time-series graph using comment_date

In [347]:
amazon['comment_date'][2].split(' ')[0]
Out[347]:
'December'
In [348]:
amazon['comment_date'][2].split(' ')[2]
Out[348]:
'2018'
In [353]:
# month

def name_cut(x):
    x = x.split(' ')[0]
    return x  

amazon["month"] = amazon["comment_date"].apply(name_cut)
amazon.head()
Out[353]:
comment_author star_rating comment_date comment_title comment month year
0 Dmitry 1.0 May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ... May 2019
1 S A 1.0 September 17, 2018 Bug eggs in package?!! Thsee don't expire until December. I opened ... September 2018
2 Elisa S. 5.0 December 17, 2018 #1 on my list The photo shows the seaweed brands I've trie... December 2018
3 Ani_Mon 1.0 July 23, 2017 Oily and stale The first time I bought these I fell in love... July 2017
4 Rachel Humphrey 5.0 February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i... February 2017
In [354]:
# year

def name_cut(x):
    x = x.split(' ')[2]
    return x  

amazon["year"] = amazon["comment_date"].apply(name_cut)
amazon.head()
Out[354]:
comment_author star_rating comment_date comment_title comment month year
0 Dmitry 1.0 May 4, 2019 This is very dangerous product ! Do not buy ! I've been buying this product for a year, I ... May 2019
1 S A 1.0 September 17, 2018 Bug eggs in package?!! Thsee don't expire until December. I opened ... September 2018
2 Elisa S. 5.0 December 17, 2018 #1 on my list The photo shows the seaweed brands I've trie... December 2018
3 Ani_Mon 1.0 July 23, 2017 Oily and stale The first time I bought these I fell in love... July 2017
4 Rachel Humphrey 5.0 February 4, 2017 The BEST seasoned seaweed makers in the WORLD I ate the entire box in a week. This stuff i... February 2017
In [351]:
import datetime

timeStr = '2018-07-28 12:11:32'
Thistime = datetime.datetime.strptime(timeStr, '%Y-%m-%d %H:%M:%S')

print(Thistime)
2018-07-28 12:11:32
In [ ]:
import matplotlib.patches as mpatches

plt.title('comment_date')
sns.pointplot(x='comment_date', y='comment', data = amazon, color = 'skyblue', label = 'positive')
positive = mpatches.Patch(color='skyblue', label='positive')
plt.legend(handles = [positive])

sns.pointplot(x='month', y='predict', data = amazon, color = 'orange', label = 'negative')
negative = mpatches.Patch(color='orange', label='negative')
plt.legend(handles = [positive, negative])

plt.ylim(300, 750)